In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import seaborn as sns
import numpy as np

import scipy.cluster.hierarchy as shc

from sklearn.preprocessing import MinMaxScaler

from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

from sklearn.metrics import confusion_matrix
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples

from sklearn.decomposition import PCA

from sklearn import datasets

%matplotlib inline
pd.set_option("display.max_columns", None)

# Lab 23 - Silhouette score revisited and Principal Components Analysis

Load the iris dataset, as in previous labs.

In [None]:
# Fetch the iris data bundled with scikit-learn, then wrap the measurement
# matrix in a DataFrame whose columns carry the feature names.
iris_dict = datasets.load_iris()

iris = pd.DataFrame(
    data=iris_dict["data"],
    columns=iris_dict["feature_names"],
)
iris.head()

Scale the columns of the iris dataset.

In [None]:
# Learn each column's min/max from the iris data, then rescale every column
# into the requested (0, 1) range.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(iris)
iris_scaled = scaler.transform(iris)

Below is code to compute the silhouette coefficient for each data point, with the number of clusters ranging from 2 to 6.

In [None]:
# For each candidate number of clusters, fit k-means on the scaled iris data,
# report the average silhouette score, and draw a silhouette plot: one
# horizontal band per cluster (samples sorted by their silhouette coefficient)
# with the overall average marked by a dashed red line.
for k in range(2, 7):
    # Only the silhouette plot is drawn, so a single panel is sufficient.
    fig, ax1 = plt.subplots(figsize=(9, 7))

    # Silhouette coefficients can range over [-1, 1]; show the full range.
    ax1.set_xlim([-1, 1])
    # The (k + 1) * 10 extra rows insert blank space between the silhouette
    # bands of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(iris_scaled) + (k + 1) * 10])

    # Fix the random seed so the cluster assignment (and therefore every
    # score and plot below) is reproducible across runs.
    clusterer = KMeans(n_clusters=k, random_state=10)
    cluster_labels = clusterer.fit_predict(iris_scaled)

    # The silhouette_score is the average over all samples; it summarises
    # the density and separation of the formed clusters.
    silhouette_avg = silhouette_score(iris_scaled, cluster_labels)
    print("For n_clusters =", k,
          "The average silhouette_score is :", silhouette_avg)

    # Per-sample silhouette coefficients.
    sample_silhouette_values = silhouette_samples(iris_scaled, cluster_labels)

    y_lower = 10
    for i in range(k):
        # Silhouette coefficients of the samples assigned to cluster i,
        # sorted ascending so the band has a smooth profile.
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i])

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # plt.cm is used because no standalone `cm` module is imported.
        color = plt.cm.nipy_spectral(float(i) / k)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the band with its cluster number at its vertical middle.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Start of the next band: leave a 10-row gap after this one.
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # Vertical line at the average silhouette score of all samples.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    fig.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % k),
                 fontsize=14, fontweight='bold')

    # Render now so each figure appears directly under its printed score.
    plt.show()


## Principal Components Analysis

Principal Components Analysis (PCA) reduces the dimensionality of a data set by successively finding the directions along which the data vary the most and using those directions as a new coordinate system.

The following code finds the first two principal components for the iris dataset:

In [None]:
# Fit a two-component PCA on the unscaled iris measurements, then project
# the same data onto those two components.
pca = PCA(n_components=2).fit(iris)
iris_rotated = pca.transform(iris)

Let's create a new dataframe with the principal components and the species names.

In [None]:
# Pair the two principal-component coordinates with each flower's species.
# iris_dict.target holds integer class codes; indexing target_names with
# those codes yields the species name strings, so a plot `hue` is treated
# as a categorical label rather than a numeric scale.
iris_rotated_df = pd.DataFrame(iris_rotated, columns=["PC1", "PC2"])
iris_rotated_df["Species"] = iris_dict.target_names[iris_dict.target]
iris_rotated_df.head()

Display this dataframe.

Use Seaborn to plot a scatter plot where x is PC1 and y is PC2, with the points colored by the species.

Let's compare this scatter plot with all possible scatter plots from the original data. 

First create a new dataframe with the original iris data and the species names.

Now use `pairplot()` to plot the scatter plots. The `hue` parameter also works here.

Load the labor dataset.

In [None]:
# Read the labor-market table, skipping the 13 lines before the header and
# the 3 trailing lines, and index the rows by major.
labor = pd.read_csv(
    "../data/Nov2019_labor_market_majors.csv",
    skiprows=13,
    skipfooter=3,
    index_col="Major",
    # skipfooter is not supported by the default C parser; naming the Python
    # engine explicitly avoids the ParserWarning and silent fallback.
    engine="python",
)

# The wage columns arrive as text containing comma separators; strip the
# commas and convert the values to floats.
for wage_col in ("Median Wage Early Career", "Median Wage Mid-Career"):
    labor[wage_col] = labor[wage_col].str.replace(",", "").astype(float)

In [None]:
# Preview the first five rows of the cleaned labor table.
labor.head(n=5)

Transform the labor data.

Put the data into a dataframe.

In [None]:
# Re-wrap the scaled values in a DataFrame that reuses labor's row and
# column labels. Expects `labor_scaled` to exist already (produced by the
# "Transform the labor data" step).
labor_scaled = pd.DataFrame(
    data=labor_scaled,
    index=labor.index,
    columns=labor.columns,
)

Find the first two principal components using the unscaled labor data.

Use k-means to cluster the scaled labor data.

Create a new dataframe with the principal components and the cluster labels.

Plot the colored-by-cluster data using the principal component coordinates.

We can even find the first three principal components and plot them in 3D.

In [None]:
# Fit a three-component PCA on the unscaled labor data, then project the
# same data onto those three components.
pca2 = PCA(n_components=3).fit(labor)
labor2_rotated = pca2.transform(labor)

In [None]:
# Tabulate the three principal-component coordinates and attach the cluster
# labels. Expects `kmeans_clusters` to exist already (from the k-means step).
labor2_rotated_df = pd.DataFrame(
    data=labor2_rotated,
    columns=["PC1", "PC2", "PC3"],
).assign(Cluster=kmeans_clusters)

In [None]:
# 3D scatter of the labor data in principal-component coordinates, coloured
# by the cluster label stored in the "Cluster" column.
# NOTE(review): assumes the cluster labels are numeric (e.g. k-means integer
# labels) so they can be passed directly as colour values — confirm.
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.scatter3D(
    labor2_rotated_df["PC1"],
    labor2_rotated_df["PC2"],
    labor2_rotated_df["PC3"],
    c=labor2_rotated_df["Cluster"],
)
# Name the axes so the figure can be read on its own.
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_zlabel("PC3");